Applying our zero-inflated model to the olfactory data.
We select only the cells that pass QC, color coded by the experimental time point. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.
load("Expt4c2b_filtdata.Rda")
load("E4c2b_slingshot_wsforkelly.RData")
To speed up the computations, I will focus on the top 1,000 most variable genes.
names(batch) <- colnames(counts)
counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]
vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
par(mfrow=c(1, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)
colMerged <- cc_rev[clus.labels]
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")
fq <- EDASeq::betweenLaneNormalization(counts, which="full")
pcafq <- prcomp(t(log1p(fq)))
plot(pcafq$x, col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")
The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.
They found that to fully explain the differences between clusters, we need three dimensions.
pairs(pcafq$x[,1:3], col=colMerged, pch=19, main="PCA of FQ log-counts, centered not scaled")
df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.00000000 -0.0432585 0.2643194 -0.02214266
## PC2 -0.04325850 1.0000000 0.6508453 0.19979644
## detection_rate 0.26431940 0.6508453 1.0000000 0.41994811
## coverage -0.02214266 0.1997964 0.4199481 1.00000000
Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.
library(scone)
library(rARPACK)
fastpca <- function(expr, scale=FALSE) {
svd_raw <- svds(scale(t(expr), center=TRUE, scale=scale), k=3, nu=3, nv=0)
pc_raw <- svd_raw$u %*% diag(svd_raw$d[1:3])
return(pc_raw)
}
raw <- as.matrix(counts)
totalcount = function (ei)
{
sums = colSums(ei)
eo = t(t(ei)*mean(sums)/sums)
return(eo)
}
tc <- totalcount(raw)
fq <- FQT_FN(raw)
tmm <- TMM_FN(raw)
vargenes <- rownames(core)
pc_raw <- fastpca(log1p(raw[vargenes,]))
pc_tc <- fastpca(log1p(tc[vargenes,]))
pc_fq <- fastpca(log1p(fq[vargenes,]))
pc_tmm <- fastpca(log1p(tmm[vargenes,]))
wrapRzifa <- function(Y, block = TRUE, k=2){
# wrapper R function for ZIFA.
# md5 hashing and temporary files are used not to re-run zifa
# if it has already be run on this computer.
d = digest(Y, "md5")
tmp = paste0(tempdir(), '/', d)
write.csv(Y, paste0(tmp, '.csv'))
if (!file.exists(paste0(tmp, "_", k, '_zifa.csv'))){
print('run ZIFA')
bb = ifelse(block, '-b ', '')
cmd = sprintf('python run_zifa.py -d %d %s%s.csv %s_%d_zifa.csv', k, bb, tmp, tmp, k)
system(cmd)
}
read.csv(sprintf("%s_%d_zifa.csv", tmp, k), header=FALSE)
}
zifa_raw <- wrapRzifa(log1p(raw[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tc <- wrapRzifa(log1p(tc[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_tmm <- wrapRzifa(log1p(tmm[vargenes,]), k=3)
## [1] "run ZIFA"
zifa_fq <- wrapRzifa(log1p(fq[vargenes,]), k=3)
## [1] "run ZIFA"
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 3)))
## user system elapsed
## 130.370 21.678 63.712
pairs(zinb_Vall@W, col=colMerged, pch=19, main="ZINB")
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage, quality=qcpca$x[,1])
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.00000000 -0.16683191 -0.107484692 0.03024469
## W2 -0.16683191 1.00000000 0.056838063 0.01042277
## W3 -0.10748469 0.05683806 1.000000000 -0.17160625
## gamma_mu 0.03024469 0.01042277 -0.171606253 1.00000000
## gamma_pi 0.25935897 0.07017869 -0.014958757 -0.09570741
## detection_rate -0.33704112 -0.15008428 -0.027467546 0.22824345
## coverage 0.07794123 -0.04513153 0.007492650 0.90197020
## quality 0.11001325 0.09386843 -0.003560763 -0.55770491
## gamma_pi detection_rate coverage quality
## W1 0.25935897 -0.33704112 0.07794123 0.110013250
## W2 0.07017869 -0.15008428 -0.04513153 0.093868427
## W3 -0.01495876 -0.02746755 0.00749265 -0.003560763
## gamma_mu -0.09570741 0.22824345 0.90197020 -0.557704907
## gamma_pi 1.00000000 -0.96579839 -0.31884978 0.299364588
## detection_rate -0.96579839 1.00000000 0.41994811 -0.347477558
## coverage -0.31884978 0.41994811 1.00000000 -0.609447350
## quality 0.29936459 -0.34747756 -0.60944735 1.000000000
mod <- model.matrix(~qcpca$x[,1:5])
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 206.298 30.046 92.629
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.00000000 0.018099164 0.131838743 0.09537145
## W2 0.01809916 1.000000000 0.001966816 0.11518619
## W3 0.13183874 0.001966816 1.000000000 -0.03705229
## gamma_mu 0.09537145 0.115186186 -0.037052286 1.00000000
## gamma_pi -0.07415694 0.056694880 -0.130417497 -0.28456970
## detection_rate 0.14377846 -0.126369965 0.153647636 0.48846297
## coverage 0.02138167 0.124259640 0.106557525 0.87012468
## gamma_pi detection_rate coverage
## W1 -0.07415694 0.1437785 0.02138167
## W2 0.05669488 -0.1263700 0.12425964
## W3 -0.13041750 0.1536476 0.10655753
## gamma_mu -0.28456970 0.4884630 0.87012468
## gamma_pi 1.00000000 -0.9406926 -0.25076070
## detection_rate -0.94069259 1.0000000 0.41994811
## coverage -0.25076070 0.4199481 1.00000000
print(system.time(zinb_4 <- zinbFit(core, ncores = 3, K = 4)))
## user system elapsed
## 151.480 25.479 71.973
pairs(zinb_4@W, col=colMerged, pch=19, main="ZINB")
save(zinb_3, zinb_4, zinb_Vall, pc_tmm, pc_fq, pc_tc, pc_raw, zifa_fq, zifa_tmm, zifa_tc, zifa_raw, file="olfactory.rda")